{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "4ac4e52d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default\n",
      "Reusing dataset chn_senti_corp (/root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "69e06369a3c846ac9a793a176ab028c0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 9600\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 1200\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 1200\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "#加载数据\n",
    "#注意：如果你的网络不允许你执行这段的代码，则直接运行【从磁盘加载数据】即可，我已经给你准备了本地化的数据文件\n",
    "dataset = load_dataset(path='seamew/ChnSentiCorp')\n",
    "\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e278c340",
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存数据集到磁盘\n",
    "#注意：运行这段代码要确保【加载数据】运行是正常的，否则直接运行【从磁盘加载数据】即可\n",
    "dataset.save_to_disk(dataset_dict_path='./data/ChnSentiCorp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5b623868",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 9600\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 1200\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 1200\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#从磁盘加载数据\n",
    "from datasets import load_from_disk\n",
    "\n",
    "dataset = load_from_disk('./data/ChnSentiCorp')\n",
    "\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8d1240ba",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 9600\n",
       "})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#取出训练集\n",
    "dataset = dataset['train']\n",
    "\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1449e9e2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',\n",
       " 'label': 1}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看一个数据\n",
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "6f179e2b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 1, 0, 0, 1, 0, 0, 0, 1, 1]\n",
      "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
      "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n"
     ]
    }
   ],
   "source": [
    "#sort\n",
    "\n",
    "#未排序的label是乱序的\n",
    "print(dataset['label'][:10])\n",
    "\n",
    "#排序之后label有序了\n",
    "sorted_dataset = dataset.sort('label')\n",
    "print(sorted_dataset['label'][:10])\n",
    "print(sorted_dataset['label'][-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0580b95d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[0, 1, 0, 0, 1, 0, 1, 0, 1, 0]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#shuffle\n",
    "\n",
    "#打乱顺序\n",
    "shuffled_dataset = sorted_dataset.shuffle(seed=42)\n",
    "\n",
    "shuffled_dataset['label'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "4b645946",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 6\n",
       "})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#select\n",
    "dataset.select([0, 10, 20, 30, 40, 50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "2a1fd2a0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "05c2a46c464c4a819c2bf9ae83634ae9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/10 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(2,\n",
       " ['选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',\n",
       "  '选择的事例太离奇了，夸大了心理咨询的现实意义，让人失去了信任感！如果说这样写的效果能在一开始抓住读者的眼球，但是看到案例主人公心理问题的原因解释时就逐渐失去了兴趣，反正有点拣了芝麻丢了西瓜的感觉。'])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#filter\n",
    "def f(data):\n",
    "    return data['text'].startswith('选择')\n",
    "\n",
    "\n",
    "start_with_ar = dataset.filter(f)\n",
    "\n",
    "len(start_with_ar), start_with_ar['text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "90d50fce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 8640\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 960\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#train_test_split, 切分训练集和测试集\n",
    "dataset.train_test_split(test_size=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b7563dea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 2400\n",
       "})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#shard\n",
    "#把数据切分到4个桶中,均匀分配\n",
    "dataset.shard(num_shards=4, index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "71352b68",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['textA', 'label'],\n",
       "    num_rows: 9600\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#rename_column\n",
    "dataset.rename_column('text', 'textA')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "408abc59",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label'],\n",
       "    num_rows: 9600\n",
       "})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#remove_columns\n",
    "dataset.remove_columns(['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "252d3de3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at /Users/lee/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85/cache-89c77711719ee85f.arrow\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['My sentence: 选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',\n",
       " 'My sentence: 15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错',\n",
       " 'My sentence: 房间太小。其他的都一般。。。。。。。。。',\n",
       " 'My sentence: 1.接电源没有几分钟,电源适配器热的不行. 2.摄像头用不起来. 3.机盖的钢琴漆，手不能摸，一摸一个印. 4.硬盘分区不好办.',\n",
       " 'My sentence: 今天才知道这书还有第6卷,真有点郁闷:为什么同一套书有两种版本呢?当当网是不是该跟出版社商量商量,单独出个第6卷,让我们的孩子不会有所遗憾。']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#map\n",
    "def f(data):\n",
    "    data['text'] = 'My sentence: ' + data['text']\n",
    "    return data\n",
    "\n",
    "\n",
    "datatset_map = dataset.map(f)\n",
    "\n",
    "datatset_map['text'][:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "56ff9517",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'label': tensor(1)}"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#set_format\n",
    "dataset.set_format(type='torch', columns=['label'])\n",
    "\n",
    "dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "de71789e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default\n",
      "Reusing dataset chn_senti_corp (/root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eb5bba4bfe3845409af7cfa41053db11",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-5c7b6878c4d0b4ab\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-5c7b6878c4d0b4ab/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e7906d8664b847aea31a94607caec7fd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fdd0d791510847ebb65fd96bd2aa7047",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5c7b6878c4d0b4ab/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'Unnamed: 0': 20, 'text': '非常不错，服务很好，位于市中心区，交通方便，不过价格也高！', 'label': 1}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第三章/导出为csv格式\n",
    "dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')\n",
    "dataset.to_csv(path_or_buf='./data/ChnSentiCorp.csv')\n",
    "\n",
    "#加载csv格式数据\n",
    "csv_dataset = load_dataset(path='csv',\n",
    "                           data_files='./data/ChnSentiCorp.csv',\n",
    "                           split='train')\n",
    "csv_dataset[20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7da34a2d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default\n",
      "Reusing dataset chn_senti_corp (/root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "19c4f575a49d4eec8029424edb8685a5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-68e5684be5cee328\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-68e5684be5cee328/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9110099fc1f94574a0fddadf6b3fd144",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d47234c67f5449d189bbefb07f260ca1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-68e5684be5cee328/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'text': '非常不错，服务很好，位于市中心区，交通方便，不过价格也高！', 'label': 1}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第三章/导出为json格式\n",
    "dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')\n",
    "dataset.to_json(path_or_buf='./data/ChnSentiCorp.json')\n",
    "\n",
    "#加载json格式数据\n",
    "json_dataset = load_dataset(path='json',\n",
    "                            data_files='./data/ChnSentiCorp.json',\n",
    "                            split='train')\n",
    "json_dataset[20]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
